IndexSegment.java example

Explorer

damp.ekeko.snippets-master
- damp.ekeko.snippets.plugin
  - src
    - damp
      - ekeko
        snippets
        BoundDirective.java
        DirectiveOperandBinding.java
        EkekoSnippetsPlugin.java
        ExtractedSnippet.java
        NaiveASTFlattener.java
        OperatorOperandBinding.java
        SnippetBaseListener.java
        SnippetBaseVisitor.java
        SnippetExtractor.java
        SnippetLexer.java
        SnippetListener.java
        SnippetParser.java
        SnippetVisitor.java
        data
        SnippetOperator.java
        TemplateGroup.java
        geneticsearch
        PartialJavaProjectModel.java
        gui
        BoundDirectivesEditorDialog.java
        BoundDirectivesViewer.java
        ChartCanvas.java
        ClojureFileEditorInput.java
        DirectiveOperandBindingEditingSupport.java
        DirectiveOperandBindingLabelProviderValue.java
        DirectiveSelectionDialog.java
        IntendedResultsEditor.java
        IntendedResultsEditorCommandHandler.java
        IntendedResultsEditorInput.java
        IntendedResultsEditorPersistableElementFactory.java
        MutationHistoryDialog.java
        OperandBindingLabelProviderDescription.java
        OperatorOperandBindingEditingSupport.java
        OperatorOperandBindingLabelProviderValue.java
        OperatorOperandsView.java
        OperatorOperandsViewer.java
        OperatorTreeContentProvider.java
        OperatorTreeLabelProvider.java
        PopulationInspectorDialog.java
        QueryInspectorDialog.java
        RecommendationEditor.java
        RecommendationEditorCommandHandler.java
        RecommendationEditorInput.java
        RecommendationEditorPersistableElementFactory.java
        RewritesTemplateEditor.java
        SubjectsTemplateEditor.java
        TemplateCodeGenerator.java
        TemplateEditor.java
        TemplateEditorActionBarContributor.java
        TemplateEditorCommandHandler.java
        TemplateEditorInput.java
        TemplateEditorPersistableElementFactory.java
        TemplateGroupNodeSelectionDialog.java
        TemplateGroupTemplateElement.java
        TemplateGroupViewer.java
        TemplateGroupViewerNodeDoubleClickListener.java
        TemplateGroupViewerNodeSelectionEvent.java
        TemplateGroupViewerNodeSelectionListener.java
        TemplatePrettyPrinter.java
        TemplateTreeContentProvider.java
        TemplateTreeLabelProviders.java
        TransformationEditor.java
        TransformationEditorActionBarContributor.java
        TransformationEditorCommandHandler.java
        TransformationEditorInput.java
        TransformationEditorPersistableElementFactory.java
        TransformationOverviewEditor.java
    - ec
      - util
        MersenneTwister.java
- damp.ekeko.snippets.plugin.test
  - resources
  - src
    - test
      - damp
        ekeko
        snippets
        EkekoSnippetsTest.java
        experiments
        GeneticSearchTest.java

/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.indexer;

import net.nutch.pagedb.*;
import net.nutch.linkdb.*;
import net.nutch.fetcher.*;
import net.nutch.analysis.NutchDocumentAnalyzer;
import net.nutch.db.*;
import net.nutch.io.*;
import net.nutch.util.*;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import java.util.logging.Logger;
import java.util.logging.Level;
import java.util.Date;
import java.io.File;
import java.io.EOFException;

/**
 * Creates an index for the output corresponding to a single fetcher run.
 *
 * <p>The index is written to an <code>index</code> sub-directory of the
 * segment directory.  When run through {@link #main(String[])}, a
 * {@link #DONE_NAME} marker file is created in the segment directory once
 * indexing has finished, and a segment that already contains that marker is
 * refused.
 */
public class IndexSegment {
  /** Name of the marker file created in a segment once it has been indexed. */
  public static final String DONE_NAME = "index.done";
  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.index.IndexSegment");

  /** When true, a page's boost is also scaled by its anchor count.
   * Read from <code>indexer.boost.by.link.count</code>; defaults to false. */
  private boolean boostByLinkCount =
    NutchConf.getBoolean("indexer.boost.by.link.count", false);

  /** Exponent applied to a page's score when computing its boost.
   * Read from <code>indexer.score.power</code>; defaults to 0.5. */
  private float scorePower = NutchConf.getFloat("indexer.score.power", 0.5f);

  /** Titles longer than this many characters are truncated.
   * Read from <code>indexer.max.title.length</code>; defaults to 100. */
  private int maxTitleLength =
    NutchConf.getInt("indexer.max.title.length", 100);

  /** The segment directory to index; set by {@link #main(String[])}. */
  private File directory = null;

  /** Upper bound on the number of fetcher entries read; unlimited by
   * default. */
  private int maxDocs = Integer.MAX_VALUE;

  /** Determines the power of link analysis scores.  Each page's boost is
   * set to <i>score<sup>scorePower</sup></i> where <i>score</i> is its link
   * analysis score and <i>scorePower</i> is the value passed to this method.
   */
  public void setScorePower(float power) { scorePower = power; }

  /**
   * Reads the fetcher output and fetcher text of the segment in lock step,
   * adds one document per successfully fetched page to a freshly created
   * index, then optimizes the index.
   *
   * <p>An unexpected end-of-file while reading the segment is logged and
   * ignored, so that whatever was read up to that point is still indexed.
   * The index writer and both readers are closed on every exit path.
   *
   * @throws Exception if reading the segment or writing the index fails
   */
  private void indexPages() throws Exception {
    IndexWriter writer
      = new IndexWriter(new File(directory, "index"),
                        new NutchDocumentAnalyzer(), true);

    boolean completed = false;
    try {
      writer.mergeFactor = 50;
      writer.minMergeDocs = 50;
      writer.infoStream = LogFormatter.getLogStream(LOG, Level.INFO);
      writer.setUseCompoundFile(false);
      writer.setSimilarity(new NutchSimilarity());

      ArrayFile.Reader fetcher = null;
      ArrayFile.Reader text = null;

      int count = 0;
      try {
        fetcher = new ArrayFile.Reader
          (new File(directory, FetcherOutput.DIR_NAME).toString());
        text = new ArrayFile.Reader
          (new File(directory, FetcherText.DIR_NAME).toString());

        String segmentName = directory.getCanonicalFile().getName();
        FetcherOutput fetcherOutput = new FetcherOutput();
        FetcherText fetcherText = new FetcherText();

        while (fetcher.next(fetcherOutput) != null && count++ < maxDocs) {
          // always advance the text reader too, so both stay aligned even
          // when the page itself is skipped below
          text.next(fetcherText);

          if (fetcherOutput.getStatus() != FetcherOutput.SUCCESS)
            continue;                             // don't index the page

          Document doc = makeDocument(segmentName, fetcher.key(),
                                      fetcherOutput, fetcherText);
          writer.addDocument(doc);
        }
      } catch (EOFException e) {
        LOG.warning("Unexpected EOF in: " + directory +
                    " at entry #" + count + ".  Ignoring.");
      } finally {
        // nested so that a failure closing one reader can't leak the other
        try {
          if (fetcher != null)
            fetcher.close();
        } finally {
          if (text != null)
            text.close();
        }
      }

      LOG.info("Optimizing index...");
      writer.optimize();
      completed = true;
    } finally {
      if (completed) {
        writer.close();
      } else {
        // An exception is already propagating: still release the writer,
        // but don't let a secondary close failure mask the original error.
        try {
          writer.close();
        } catch (Exception e) {
          LOG.warning("Failed to close index writer for: " + directory +
                      " (" + e + ")");
        }
      }
    }
  }

  /**
   * Builds the index document for a single fetched page.
   *
   * @param segmentName name of the segment, added as the "segment" field
   * @param docNo the page's key in the fetcher output, added in hexadecimal
   *        as the "docNo" field
   * @param fetcherOutput supplies the fetch list entry, title and MD5 hash
   * @param fetcherText supplies the text added as the "content" field
   * @return the document, with its boost set
   * @throws Exception if any of the page's data cannot be obtained
   */
  private Document makeDocument(String segmentName, long docNo,
                                FetcherOutput fetcherOutput,
                                FetcherText fetcherText)
    throws Exception {

    FetchListEntry fle = fetcherOutput.getFetchListEntry();
    String url = fle.getPage().getURL().toString();
    String title = fetcherOutput.getTitle();

    if (title.length() > maxTitleLength) {        // truncate title if needed
      title = title.substring(0, maxTitleLength);
    }

    Document doc = new Document();

    // url is both stored and indexed, so it's both searchable and returned
    doc.add(Field.Text("url", url));

    // un-indexed fields: not searchable, but in hits and/or used by dedup
    doc.add(Field.UnIndexed("title", title));
    doc.add(Field.UnIndexed("digest", fetcherOutput.getMD5Hash().toString()));
    doc.add(Field.UnIndexed("docNo", Long.toString(docNo, 16)));
    doc.add(Field.UnIndexed("segment", segmentName));

    // content is indexed, so that it's searchable, but not stored in index
    doc.add(Field.UnStored("content", fetcherText.getText()));

    // anchors are indexed, so they're searchable, but not stored in index
    String[] anchors = fle.getAnchors();
    for (int i = 0; i < anchors.length; i++) {
      doc.add(Field.UnStored("anchor", anchors[i]));
    }

    // add title as anchor so it's searchable.  doesn't warrant its own field.
    doc.add(Field.UnStored("anchor", title));

    // compute boost
    // 1. Start with page's score from DB -- 1.0 if no link analysis.
    float boost = fle.getPage().getScore();
    // 2. Apply scorePower to this.
    boost = (float)Math.pow(boost, scorePower);
    // 3. Optionally boost by log of incoming anchor count.  Adding e keeps
    //    the factor at exactly 1 for a page without anchors.
    if (boostByLinkCount)
      boost *= (float)Math.log(Math.E + anchors.length);
    // 4. Apply boost to all indexed fields.
    doc.setBoost(boost);

    // store boost for use by explain and dedup
    doc.add(Field.UnIndexed("boost", Float.toString(boost)));

    return doc;
  }


  /**
   * Create an index for the input files in the named directory.
   *
   * <p>Arguments: an optional <code>-max &lt;count&gt;</code> limiting the
   * number of fetcher entries read, followed by the segment directory,
   * which must be the last argument.  Malformed arguments print the usage
   * message and return without indexing.
   *
   * @throws RuntimeException if the segment already contains the
   *         {@link #DONE_NAME} marker file
   * @throws Exception if indexing fails
   */
  public static void main(String[] args) throws Exception {

    String usage = "IndexSegment [-max <count>] <segment_directory>";

    if (args.length == 0) {
      System.err.println("Usage: " + usage);
      return;
    }

    IndexSegment indexer = new IndexSegment();

    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-max")) {        // parse -max option
        if (i == args.length-1) {          // -max given without a value
          System.err.println("Usage: " + usage);
          return;
        }
        try {
          indexer.maxDocs = Integer.parseInt(args[++i]);
        } catch (NumberFormatException e) {
          System.err.println("Invalid value for -max: " + args[i]);
          System.err.println("Usage: " + usage);
          return;
        }
      } else if (i != args.length-1) {
        System.err.println("Usage: " + usage);
        return;
      } else {
        indexer.directory = new File(args[i]);
      }
    }

    // Only options were given.  Without this check a null directory would
    // make the File(File, String) constructors below resolve against the
    // current working directory and index into it.
    if (indexer.directory == null) {
      System.err.println("Usage: " + usage);
      return;
    }

//     File fetcherDone = new File(indexer.directory, FetcherOutput.DONE_NAME);
//     if (!fetcherDone.exists())                    // check fetcher done file
//       throw new RuntimeException("can't index--not yet fetched: " +
//                                  fetcherDone + " does not exist");

    File doneFile = new File(indexer.directory, DONE_NAME);
    if (doneFile.exists())                        // check index done file
      throw new RuntimeException("already indexed: " + doneFile + " exists");

    LOG.info("indexing segment: " + indexer.directory);

    indexer.indexPages();
    doneFile.createNewFile();                     // create the done file

    LOG.info("done indexing");
  }

}